In [1]:
## Importing Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split  ## sklearn.cross_validation was removed in 0.20
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_auc_score, accuracy_score, auc, roc_curve, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
import yellowbrick
from yellowbrick.classifier import ROCAUC
%matplotlib inline
plt.style.use('ggplot')
In [2]:
df = pd.read_csv('Churn_Modelling.csv', low_memory=False)
In [3]:
df.head()
Out[3]:
In [4]:
df.info() ## seems like no missing values
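## Quick explicit check for missing values, in case anything slipped past info() (sketch):
df.isnull().sum()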
In [5]:
plt.figure(figsize=(15,7))
sns.histplot(df['CreditScore'], kde=True)  ## distplot is deprecated in recent seaborn
plt.title('Distribution of the Credit Score')
plt.show()
## Almost a Normal Distribution
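## To back up the eyeball test, skewness and excess kurtosis are cheap to compute
## (sketch; values near 0 are consistent with a roughly normal shape):
print("Skew: {:.3f}".format(df['CreditScore'].skew()))
print("Excess kurtosis: {:.3f}".format(df['CreditScore'].kurtosis()))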
In [6]:
## Logistic regression with just the credit score as the predictor and Exited as the target
X = df[['CreditScore']]
y = df['Exited']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 25)
clr = LogisticRegression()
clr.fit(X_train, y_train)
clr.predict(X_test)
print("Accuracy of the model: {}".format(accuracy_score(y_test, clr.predict(X_test))))
print("10-fold cross validation accuracy of the model: {}".format(cross_val_score(clr,X_train, y_train, cv=10).mean()))
## Accuracy looks decent, but roughly 80% of customers did not exit, so a trivial "always predict 0" model scores about the same
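## Sanity check (sketch): compare against the majority-class baseline before trusting the accuracy.
print("Share of customers who stayed: {:.3f}".format(1 - y_test.mean()))
print("Model accuracy: {:.3f}".format(accuracy_score(y_test, clr.predict(X_test))))
## If the two numbers are close, the model is barely beating "always predict 0".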
In [7]:
roc_auc_score(y_test, clr.predict(X_test)) ## Really bad: hard 0/1 labels discard the ranking information AUC needs
Out[7]:
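## AUC measures how well the model ranks customers, so score the positive-class
## probabilities rather than the hard labels (sketch):
roc_auc_score(y_test, clr.predict_proba(X_test)[:, 1])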
In [8]:
plt.figure(figsize=(10,7))
probs = clr.predict_proba(X_test)
preds = probs[:,1]
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'LR AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
Out[8]:
In [9]:
plt.figure(figsize=(12,7))
plt.title('Geography Count Plot')
df['Geography'].value_counts().plot(kind='barh')
## About half the dataset is from France
Out[9]:
In [10]:
plt.figure(figsize=(10,7))
plt.title('Gender Count Plot')
df['Gender'].value_counts().plot(kind='barh', color='green')
## Almost balanced
Out[10]:
In [11]:
## Question to answer: how many customers are from each country, and what share of them exited?
for i in df['Geography'].unique():
    print(i)
    print(20 * '--')
    print(len(df[df['Geography'] == i]))
    print(len(df[(df['Geography'] == i) & (df['Exited'] == 1)]) / len(df[df['Geography'] == i]))
    print('\n')
## We can see that in Germany about 1/3 of the people exited
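## The same breakdown in one line with groupby, if preferred (sketch; the mean of Exited is the churn rate):
df.groupby('Geography')['Exited'].agg(['count', 'mean'])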
In [12]:
plt.figure(figsize=(10,7))
sns.countplot(data=df, x='Geography', hue='Exited')  ## recent seaborn requires keyword arguments
Out[12]:
In [13]:
plt.figure(figsize=(10,7))
sns.countplot(data=df, x='Geography', hue='Gender')
Out[13]:
In [14]:
plt.figure(figsize=(10,7))
sns.kdeplot(df['Age'])
## Roughly bell-shaped, but with a clear right skew toward older customers
Out[14]:
In [15]:
plt.figure(figsize=(10,7))
sns.kdeplot(df['EstimatedSalary'])
Out[15]:
In [16]:
df['Tenure'].describe()
Out[16]:
In [17]:
## Let's find out which features turn out to be the important ones, using recursive feature elimination (RFE)
In [18]:
rfe = RFE(RandomForestClassifier(n_estimators=1000), n_features_to_select=5)
In [19]:
X = df.iloc[:,3:-1]
y = df['Exited']
## Encode the categorical variables
X = pd.get_dummies(X, drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 25)
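## Worth confirming what get_dummies produced; drop_first=True keeps k-1 dummy columns
## per categorical variable to avoid perfectly collinear columns (sketch):
print(X.columns.tolist())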
In [20]:
rfe.fit(X_train,y_train)
Out[20]:
In [21]:
rfe.ranking_
Out[21]:
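## The raw ranking array is hard to read; pairing it with the column names helps
## (sketch; rank 1 = kept by RFE):
pd.Series(rfe.ranking_, index=X.columns).sort_values()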
In [22]:
rfe.predict(X_test)
Out[22]:
In [23]:
roc_auc_score(y_test, rfe.predict(X_test)) ## Slightly better, though still computed from hard labels rather than probabilities
Out[23]:
In [24]:
plt.figure(figsize=(10,7))
probs = rfe.predict_proba(X_test)
preds = probs[:,1]
fpr, tpr, threshold = roc_curve(y_test, preds)  ## use the probabilities computed above, not hard predictions
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'RFE AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
Out[24]:
In [25]:
rf = RandomForestClassifier(n_estimators=1000)
In [26]:
rf.fit(X_train, y_train)
Out[26]:
In [27]:
rf.predict(X_test)
Out[27]:
In [28]:
roc_auc_score(y_test, rf.predict(X_test))
Out[28]:
In [29]:
plt.figure(figsize=(10,7))
probs = rf.predict_proba(X_test)  ## score the full random forest here, not the RFE model
preds = probs[:,1]
fpr, tpr, threshold = roc_curve(y_test, preds)  ## probabilities, not hard predictions
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'RF AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
Out[29]:
In [30]:
for importance, feature in zip(rf.feature_importances_, X.columns):
    print(feature, importance)
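## Sorting makes the importances much easier to compare (sketch):
pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)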
In [31]:
rfe.ranking_ ## compare the RFE ranking with the importances above
Out[31]:
In [32]:
## Okay good!
## Let's go with the random forest and build on it; let's also look at the other metrics to see how it is doing
In [33]:
y_pred = rf.predict(X_test)
In [34]:
print("Accuracy of the model: {}".format(accuracy_score(y_test, y_pred)))
print("10-fold cross validation accuracy of the model: {}".format(cross_val_score(clr,X_train, y_train, cv=10).mean()))
In [35]:
confusion_matrix(y_test, y_pred)
Out[35]:
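## A labeled version is easier to read; the row/column names below are my own labels,
## not sklearn output (sketch; rows = actual, columns = predicted):
pd.DataFrame(confusion_matrix(y_test, y_pred),
             index=['Actual: Stayed', 'Actual: Exited'],
             columns=['Predicted: Stayed', 'Predicted: Exited'])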
In [36]:
print(classification_report(y_test, y_pred))
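## yellowbrick's ROCAUC was imported at the top but never used; a minimal sketch of how it
## could wrap the fitted forest (the class names are my own labels; on older yellowbrick
## versions show() is called poof()):
visualizer = ROCAUC(rf, classes=['Stayed', 'Exited'])
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()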
In [37]:
## Ends here!!